{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/7c/tx2rnzzj2_x546cjtv31_6yr0000gn/T/jieba.cache\n",
      "Loading model cost 1.631 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\n",
    "import jieba\n",
    "import jieba.analyse\n",
    "\n",
    "with open('./zztj.txt') as f:\n",
    "    document = f.read()\n",
    "    \n",
    "    #document_decode = document.decode('GBK')\n",
    "    \n",
    "    document_cut = jieba.cut(document)\n",
    "    #print ( ' '.join(document_cut))  #如果打印结果，则分词效果消失，后面的result无法显示\n",
    "    result = ' '.join(document_cut)\n",
    "    #result = result.encode('utf-8')\n",
    "    with open('./zztj_segment.txt', 'w') as f2:\n",
    "        f2.write(result)\n",
    "f.close()\n",
    "f2.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-03-16 19:46:04,340 : INFO : collecting all words and their counts\n",
      "2020-03-16 19:46:04,376 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
      "2020-03-16 19:46:04,496 : INFO : PROGRESS: at sentence #10000, processed 199274 words, keeping 28745 word types\n",
      "2020-03-16 19:46:04,590 : INFO : PROGRESS: at sentence #20000, processed 397588 words, keeping 48194 word types\n",
      "2020-03-16 19:46:04,695 : INFO : PROGRESS: at sentence #30000, processed 597685 words, keeping 64009 word types\n",
      "2020-03-16 19:46:04,801 : INFO : PROGRESS: at sentence #40000, processed 795048 words, keeping 79627 word types\n",
      "2020-03-16 19:46:04,893 : INFO : PROGRESS: at sentence #50000, processed 989389 words, keeping 93505 word types\n",
      "2020-03-16 19:46:04,977 : INFO : PROGRESS: at sentence #60000, processed 1193064 words, keeping 107948 word types\n",
      "2020-03-16 19:46:05,058 : INFO : PROGRESS: at sentence #70000, processed 1392525 words, keeping 121970 word types\n",
      "2020-03-16 19:46:05,141 : INFO : PROGRESS: at sentence #80000, processed 1592220 words, keeping 135040 word types\n",
      "2020-03-16 19:46:05,222 : INFO : PROGRESS: at sentence #90000, processed 1788516 words, keeping 146953 word types\n",
      "2020-03-16 19:46:05,301 : INFO : PROGRESS: at sentence #100000, processed 1984122 words, keeping 157352 word types\n",
      "2020-03-16 19:46:05,385 : INFO : PROGRESS: at sentence #110000, processed 2174577 words, keeping 167553 word types\n",
      "2020-03-16 19:46:05,473 : INFO : PROGRESS: at sentence #120000, processed 2366935 words, keeping 177973 word types\n",
      "2020-03-16 19:46:05,565 : INFO : PROGRESS: at sentence #130000, processed 2565930 words, keeping 189368 word types\n",
      "2020-03-16 19:46:05,654 : INFO : PROGRESS: at sentence #140000, processed 2762821 words, keeping 200151 word types\n",
      "2020-03-16 19:46:05,737 : INFO : PROGRESS: at sentence #150000, processed 2957483 words, keeping 210130 word types\n",
      "2020-03-16 19:46:05,816 : INFO : PROGRESS: at sentence #160000, processed 3151576 words, keeping 220066 word types\n",
      "2020-03-16 19:46:05,897 : INFO : PROGRESS: at sentence #170000, processed 3348480 words, keeping 229231 word types\n",
      "2020-03-16 19:46:05,980 : INFO : PROGRESS: at sentence #180000, processed 3541298 words, keeping 238183 word types\n",
      "2020-03-16 19:46:06,063 : INFO : PROGRESS: at sentence #190000, processed 3731414 words, keeping 246828 word types\n",
      "2020-03-16 19:46:06,181 : INFO : PROGRESS: at sentence #200000, processed 3926282 words, keeping 255477 word types\n",
      "2020-03-16 19:46:06,299 : INFO : PROGRESS: at sentence #210000, processed 4126206 words, keeping 264242 word types\n",
      "2020-03-16 19:46:06,429 : INFO : PROGRESS: at sentence #220000, processed 4320481 words, keeping 272353 word types\n",
      "2020-03-16 19:46:06,548 : INFO : PROGRESS: at sentence #230000, processed 4512238 words, keeping 280533 word types\n",
      "2020-03-16 19:46:06,689 : INFO : PROGRESS: at sentence #240000, processed 4701580 words, keeping 288698 word types\n",
      "2020-03-16 19:46:06,784 : INFO : PROGRESS: at sentence #250000, processed 4893946 words, keeping 296693 word types\n",
      "2020-03-16 19:46:06,881 : INFO : PROGRESS: at sentence #260000, processed 5084019 words, keeping 304617 word types\n",
      "2020-03-16 19:46:06,959 : INFO : collected 310159 word types from a corpus of 5226851 raw words and 265510 sentences\n",
      "2020-03-16 19:46:06,961 : INFO : Loading a fresh vocabulary\n",
      "2020-03-16 19:46:08,424 : INFO : effective_min_count=1 retains 310159 unique words (100% of original 310159, drops 0)\n",
      "2020-03-16 19:46:08,425 : INFO : effective_min_count=1 leaves 5226851 word corpus (100% of original 5226851, drops 0)\n",
      "2020-03-16 19:46:09,268 : INFO : deleting the raw counts dictionary of 310159 items\n",
      "2020-03-16 19:46:09,275 : INFO : sample=0.001 downsamples 32 most-common words\n",
      "2020-03-16 19:46:09,276 : INFO : downsampling leaves estimated 4128428 word corpus (79.0% of prior 5226851)\n",
      "2020-03-16 19:46:09,620 : INFO : constructing a huffman tree from 310159 words\n",
      "2020-03-16 19:46:19,744 : INFO : built huffman tree with maximum node depth 22\n",
      "2020-03-16 19:46:20,341 : INFO : estimated required memory for 310159 words and 100 dimensions: 589302100 bytes\n",
      "2020-03-16 19:46:20,342 : INFO : resetting layer weights\n",
      "2020-03-16 19:47:29,257 : INFO : training model with 3 workers on 310159 vocabulary and 100 features, using sg=0 hs=1 sample=0.001 negative=5 window=3\n",
      "2020-03-16 19:47:30,283 : INFO : EPOCH 1 - PROGRESS: at 5.48% examples, 225565 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:31,303 : INFO : EPOCH 1 - PROGRESS: at 16.14% examples, 329298 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:47:32,306 : INFO : EPOCH 1 - PROGRESS: at 26.66% examples, 364754 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:47:33,327 : INFO : EPOCH 1 - PROGRESS: at 37.36% examples, 381911 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:34,334 : INFO : EPOCH 1 - PROGRESS: at 47.46% examples, 386971 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:35,342 : INFO : EPOCH 1 - PROGRESS: at 57.28% examples, 389064 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:47:36,348 : INFO : EPOCH 1 - PROGRESS: at 67.08% examples, 390548 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:37,360 : INFO : EPOCH 1 - PROGRESS: at 77.44% examples, 393470 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:38,371 : INFO : EPOCH 1 - PROGRESS: at 88.26% examples, 398475 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:39,392 : INFO : EPOCH 1 - PROGRESS: at 98.85% examples, 401398 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:39,508 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2020-03-16 19:47:39,517 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2020-03-16 19:47:39,519 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2020-03-16 19:47:39,520 : INFO : EPOCH - 1 : training on 5226851 raw words (4128882 effective words) took 10.3s, 402806 effective words/s\n",
      "2020-03-16 19:47:40,526 : INFO : EPOCH 2 - PROGRESS: at 10.77% examples, 447870 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:41,532 : INFO : EPOCH 2 - PROGRESS: at 21.22% examples, 438287 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:42,538 : INFO : EPOCH 2 - PROGRESS: at 31.97% examples, 440555 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:43,577 : INFO : EPOCH 2 - PROGRESS: at 43.44% examples, 442620 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:47:44,578 : INFO : EPOCH 2 - PROGRESS: at 53.75% examples, 439503 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:45,581 : INFO : EPOCH 2 - PROGRESS: at 63.61% examples, 432984 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:46,582 : INFO : EPOCH 2 - PROGRESS: at 73.41% examples, 427531 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:47,589 : INFO : EPOCH 2 - PROGRESS: at 83.15% examples, 424021 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:47:48,629 : INFO : EPOCH 2 - PROGRESS: at 91.67% examples, 413088 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:49,655 : INFO : EPOCH 2 - PROGRESS: at 99.67% examples, 405726 words/s, in_qsize 3, out_qsize 0\n",
      "2020-03-16 19:47:49,657 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2020-03-16 19:47:49,658 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2020-03-16 19:47:49,665 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2020-03-16 19:47:49,666 : INFO : EPOCH - 2 : training on 5226851 raw words (4128547 effective words) took 10.1s, 407044 effective words/s\n",
      "2020-03-16 19:47:50,740 : INFO : EPOCH 3 - PROGRESS: at 7.01% examples, 271928 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:47:51,775 : INFO : EPOCH 3 - PROGRESS: at 15.37% examples, 302770 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:52,801 : INFO : EPOCH 3 - PROGRESS: at 26.10% examples, 346060 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:47:53,805 : INFO : EPOCH 3 - PROGRESS: at 34.66% examples, 347680 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:47:54,812 : INFO : EPOCH 3 - PROGRESS: at 43.04% examples, 345735 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:55,853 : INFO : EPOCH 3 - PROGRESS: at 53.75% examples, 359085 words/s, in_qsize 3, out_qsize 2\n",
      "2020-03-16 19:47:56,858 : INFO : EPOCH 3 - PROGRESS: at 64.37% examples, 369149 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:47:57,883 : INFO : EPOCH 3 - PROGRESS: at 75.13% examples, 376009 words/s, in_qsize 5, out_qsize 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-03-16 19:47:58,890 : INFO : EPOCH 3 - PROGRESS: at 84.90% examples, 378506 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:47:59,912 : INFO : EPOCH 3 - PROGRESS: at 95.95% examples, 384136 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:48:00,302 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2020-03-16 19:48:00,324 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2020-03-16 19:48:00,340 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2020-03-16 19:48:00,341 : INFO : EPOCH - 3 : training on 5226851 raw words (4128254 effective words) took 10.7s, 386774 effective words/s\n",
      "2020-03-16 19:48:01,350 : INFO : EPOCH 4 - PROGRESS: at 10.40% examples, 431383 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:02,396 : INFO : EPOCH 4 - PROGRESS: at 20.67% examples, 417830 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:03,421 : INFO : EPOCH 4 - PROGRESS: at 28.37% examples, 383159 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:48:04,466 : INFO : EPOCH 4 - PROGRESS: at 36.02% examples, 362594 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:48:05,483 : INFO : EPOCH 4 - PROGRESS: at 46.14% examples, 370666 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:06,486 : INFO : EPOCH 4 - PROGRESS: at 53.37% examples, 359137 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:48:07,493 : INFO : EPOCH 4 - PROGRESS: at 60.92% examples, 351542 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:08,495 : INFO : EPOCH 4 - PROGRESS: at 69.08% examples, 348825 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:09,507 : INFO : EPOCH 4 - PROGRESS: at 76.50% examples, 343146 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:10,552 : INFO : EPOCH 4 - PROGRESS: at 85.50% examples, 344356 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:48:11,563 : INFO : EPOCH 4 - PROGRESS: at 95.75% examples, 350153 words/s, in_qsize 4, out_qsize 1\n",
      "2020-03-16 19:48:11,987 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2020-03-16 19:48:12,019 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2020-03-16 19:48:12,024 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2020-03-16 19:48:12,025 : INFO : EPOCH - 4 : training on 5226851 raw words (4129182 effective words) took 11.7s, 353536 effective words/s\n",
      "2020-03-16 19:48:13,041 : INFO : EPOCH 5 - PROGRESS: at 10.03% examples, 411703 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:14,049 : INFO : EPOCH 5 - PROGRESS: at 17.10% examples, 350228 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:15,090 : INFO : EPOCH 5 - PROGRESS: at 26.66% examples, 361615 words/s, in_qsize 6, out_qsize 0\n",
      "2020-03-16 19:48:16,092 : INFO : EPOCH 5 - PROGRESS: at 36.97% examples, 377255 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:17,111 : INFO : EPOCH 5 - PROGRESS: at 47.09% examples, 382273 words/s, in_qsize 3, out_qsize 2\n",
      "2020-03-16 19:48:18,149 : INFO : EPOCH 5 - PROGRESS: at 58.48% examples, 393616 words/s, in_qsize 5, out_qsize 1\n",
      "2020-03-16 19:48:19,167 : INFO : EPOCH 5 - PROGRESS: at 69.08% examples, 398114 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:20,181 : INFO : EPOCH 5 - PROGRESS: at 80.04% examples, 403925 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:21,197 : INFO : EPOCH 5 - PROGRESS: at 89.88% examples, 402359 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:22,209 : INFO : EPOCH 5 - PROGRESS: at 98.34% examples, 395823 words/s, in_qsize 5, out_qsize 0\n",
      "2020-03-16 19:48:22,402 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2020-03-16 19:48:22,421 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2020-03-16 19:48:22,430 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2020-03-16 19:48:22,431 : INFO : EPOCH - 5 : training on 5226851 raw words (4128153 effective words) took 10.4s, 396783 effective words/s\n",
      "2020-03-16 19:48:22,432 : INFO : training on a 26134255 raw words (20643018 effective words) took 53.2s, 388219 effective words/s\n",
      "2020-03-16 19:48:22,459 : INFO : saving Word2Vec object under zztj.model, separately None\n",
      "2020-03-16 19:48:22,461 : INFO : storing np array 'vectors' to zztj.model.wv.vectors.npy\n",
      "2020-03-16 19:48:22,605 : INFO : not storing attribute vectors_norm\n",
      "2020-03-16 19:48:22,607 : INFO : storing np array 'syn1' to zztj.model.trainables.syn1.npy\n",
      "2020-03-16 19:48:22,831 : INFO : storing np array 'syn1neg' to zztj.model.trainables.syn1neg.npy\n",
      "2020-03-16 19:48:23,083 : INFO : not storing attribute cum_table\n",
      "2020-03-16 19:48:28,218 : INFO : saved zztj.model\n"
     ]
    }
   ],
   "source": [
    "# import modules & set up logging\n",
    "import logging\n",
    "import os\n",
    "from gensim.models import word2vec\n",
    "\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
    "\n",
    "sentences = word2vec.LineSentence('./zztj_segment.txt') \n",
    "\n",
    "model = word2vec.Word2Vec(sentences, hs=1,min_count=1,window=3,size=100)\n",
    "model.save('zztj.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = word2vec.Word2Vec.load('zztj.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "空之隆\n",
      "五帝\n",
      "场所\n",
      "迎气\n",
      "藩臣\n",
      "了然\n",
      "火星\n",
      "人帝\n",
      "设庭燎\n",
      "非至言\n",
      "宗庙\n",
      "正殿\n",
      "天地\n",
      "移市\n",
      "罪疑\n",
      "众神\n",
      "明堂\n",
      "譬若\n",
      "修身\n",
      "贵神\n",
      "求雨\n",
      "天神\n",
      "祭牲\n",
      "弘嫉\n",
      "研习\n",
      "大礼\n",
      "抱定\n",
      "祖宗\n",
      "五代\n",
      "上帝\n",
      "神灵\n",
      "形式\n",
      "运数\n",
      "土谷\n",
      "后裸献\n",
      "会稽山\n",
      "汤药\n",
      "盛放\n",
      "袒胸\n",
      "以高后\n",
      "昊天\n",
      "太庙\n",
      "相互竞争\n",
      "西郊\n",
      "姑衍山\n",
      "刘恭和乐\n",
      "庙\n",
      "始祖\n",
      "圜丘\n",
      "地神\n"
     ]
    }
   ],
   "source": [
    "req_count = 50\n",
    "for key in model.wv.similar_by_word('天帝', topn =100):\n",
    "    #if len(key[0])==2:\n",
    "        req_count -= 1\n",
    "        #print (key[0] ,key[1])\n",
    "        print (key[0])\n",
    "        if req_count == 0:\n",
    "            break;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
